library(dplyr)
library(data.table)
library(forcats)
library(glmnet)
library(pROC)
library(knitr)
library(caret)
library(boot)
library(ggplot2)
library(plotly)Project Part 3
Group 9
Loading libraries
Loading dataset
dataset <- fread("SMARTc.csv", sep = ";")Re-encode the categorical variables
dataset <- mutate(dataset,
EVENT = factor(EVENT),
EVENT = fct_recode(EVENT, "no" = "0", "yes" = "1"),
SEX = factor(SEX),
SEX = fct_recode(SEX, "male" = "1", "female" = "2"),
DIABETES = factor(DIABETES),
DIABETES = fct_recode(DIABETES, "no" = "0", "yes" = "1"),
SMOKING = factor(SMOKING),
SMOKING = fct_recode(SMOKING, "never" = "1", "former" = "2", "current" = "3"),
alcohol = factor(alcohol),
alcohol = fct_recode(alcohol, "never" = "1", "former" = "2", "current" = "3"),
CEREBRAL = factor(CEREBRAL),
CEREBRAL = fct_recode(CEREBRAL, "no" = "0", "yes" = "1"),
CARDIAC = factor(CARDIAC),
CARDIAC = fct_recode(CARDIAC, "no" = "0", "yes" = "1"),
AAA = factor(AAA),
AAA = fct_recode(AAA, "no" = "0", "yes" = "1"),
PERIPH = factor(PERIPH),
PERIPH = fct_recode(PERIPH, "no" = "0", "yes" = "1"),
albumin = factor(albumin),
albumin = fct_recode(albumin, "no" = "1", "micro" = "2", "macro" = "3"),
STENOSIS = factor(STENOSIS),
STENOSIS = fct_recode(STENOSIS, "no" = "0", "yes" = "1"),
)Lasso regression model
set.seed(123)
# p.278
formula <- EVENT ~ AGE + SEX + BMI + SYSTH + HDL + DIABETES +
+HOMOC + log(CREAT) + STENOSIS + IMT + SMOKING +
alcohol + albumin
# Splitting the data
train_sample <- sample(1:nrow(dataset), nrow(dataset) * 0.7)
train_data <- dataset[train_sample, ]
test_data <- dataset[-train_sample, ]
# with grid search for finetuning
grid <- 10 ^ seq(10, -2, length = 100)
lasso.mod <- cv.glmnet(
x = model.matrix(formula, data = train_data)[, -1],
y = as.numeric(train_data$EVENT) - 1,
alpha = 1,
family = "binomial",
nfolds = 10,
lambda = grid
)
cv.out <- cv.glmnet(
x = model.matrix(formula, data = train_data)[, -1],
y = as.numeric(train_data$EVENT) - 1,
alpha = 1,
family = "binomial",
nfolds = 10,
lambda = grid
)
best_lambda <- cv.out$lambda.min
lasso.pred <- predict(lasso.mod,
s = best_lambda,
newx = model.matrix(formula, data = test_data)[, -1], type = "response"
)
# Mean squared error
mean((lasso.pred - (as.numeric(test_data$EVENT) - 1))^2)[1] 0.1023319
plot(cv.out)Final model
whole_data <- dataset
grid <- 10^seq(10, -2, length = 100)
lasso_final.mod <- cv.glmnet(
x = model.matrix(formula, data = whole_data)[, -1],
y = as.numeric(whole_data$EVENT) - 1,
alpha = 1,
family = "binomial",
nfolds = 10,
lambda = grid
)
cv.out <- cv.glmnet(
x = model.matrix(formula, data = whole_data)[, -1],
y = as.numeric(whole_data$EVENT) - 1,
alpha = 1,
family = "binomial",
nfolds = 10,
lambda = grid
)
best_lambda <- cv.out$lambda.min
lasso_final.pred <- predict(lasso_final.mod,
s = best_lambda,
newx = model.matrix(formula, data = whole_data)[, -1], type = "response"
)
# Mean squared error
mean((lasso_final.pred - (as.numeric(whole_data$EVENT) - 1))^2)[1] 0.09592532
Performance of the model
roc_lasso <- roc(test_data$EVENT, lasso.pred)
auc_lasso <- auc(roc_lasso)
roc_lasso_final <- roc(whole_data$EVENT, lasso_final.pred)
auc_lasso_final <- auc(roc_lasso_final)display code to plot the ROC curve
roc_data <- data.frame(
Sensitivity = c(roc_lasso_final$sensitivities, roc_lasso$sensitivities),
Specificity = c(1 - roc_lasso_final$specificities, 1 - roc_lasso$specificities),
Model = c(rep("Lasso Final Model", length(roc_lasso_final$sensitivities)), rep("Lasso Train-Test", length(roc_lasso$sensitivities)))
)
roc_plot <- ggplot(roc_data, aes(x = Specificity, y = Sensitivity, color = Model)) +
geom_line(linewidth = 0.8) +
geom_segment(x = 0, xend = 1, y = 0, yend = 1, linetype = "dashed", color = "gray") +
theme_minimal(base_size = 14) +
labs(
title = "ROC Curve for Lasso Model",
x = "1 - Specificity",
y = "Sensitivity"
) +
theme(
plot.title = element_text(hjust = 0.5, size = 15, face = "bold"),
legend.position = "right"
) +
coord_equal()
ggplotly(roc_plot)print(auc_lasso)Area under the curve: 0.7096
print(auc_lasso_final)Area under the curve: 0.7302
We found an AUC of around 0.73 for the Lasso model. This means that the model is able to distinguish between patients with and without an event 73% of the time.
lasso_coefficients <- coef(cv.out, s = "lambda.min")
lasso_coefficients_df <- as.data.frame(as.matrix(lasso_coefficients))
lasso_coefficients_df$variable <- rownames(lasso_coefficients_df)
colnames(lasso_coefficients_df) <- c("coefficient", "variable")
# Filter out variables with non-zero coefficients
important_variables <- subset(lasso_coefficients_df, coefficient != 0)
kable(important_variables)| coefficient | variable | |
|---|---|---|
| (Intercept) | -6.3929584 | (Intercept) |
| AGE | 0.0279864 | AGE |
| BMI | -0.0099085 | BMI |
| SYSTH | 0.0019984 | SYSTH |
| HDL | -0.5547503 | HDL |
| HOMOC | 0.0346896 | HOMOC |
| log(CREAT) | 0.4870924 | log(CREAT) |
| STENOSISyes | 0.4348631 | STENOSISyes |
| IMT | 0.4020493 | IMT |
| SMOKINGformer | 0.1024396 | SMOKINGformer |
| SMOKINGcurrent | -0.0331973 | SMOKINGcurrent |
| albuminmacro | 0.2655378 | albuminmacro |
We see that some variables have positive coefficiens, while others have negative coefficients. This means that the model considers some variables as positively correlated with the event (albuminmacro, IMT, STENOSISyes, log(CREAT), DIABETESyes, SMOKINGformer), while others are negatively correlated (HDL, SMOKINGcurrent, alcoholcurrent). Also, some variables like AGE or HOMOC have smaller coefficients, but this can be due to the unit of the variable (e.g. AGE is in years, HOMOC is in µmol/L).
Stepwise backward variable selection
k <- 10
folds <- createFolds(dataset$EVENT, k = k, list = TRUE, returnTrain = FALSE)
roc_list <- list()
auc_values <- numeric(k)
for (i in 1:k) {
train_data <- dataset[-folds[[i]], ]
test_data <- dataset[folds[[i]], ]
stepwise_backward <- step(
glm(
formula,
data = train_data,
family = "binomial"
),
direction = "backward"
)
stepwise_backward_pred <- predict(
stepwise_backward,
newdata = test_data,
type = "response"
)
roc_i <- roc(test_data$EVENT, stepwise_backward_pred)
roc_list[[i]] <- roc_i
auc_values[i] <- auc(roc_i)
}display code to plot the ROC curves
roc_df <- do.call(rbind, lapply(1:length(roc_list), function(i) {
data.frame(
Fold = paste("Fold", i),
Sensitivity = roc_list[[i]]$sensitivities,
Specificity = 1 - roc_list[[i]]$specificities
)
}))
roc_plot_stepwise_back_cv <- ggplot(roc_df, aes(x = Specificity, y = Sensitivity, color = Fold)) +
geom_line(linewidth = 0.8) +
geom_segment(x = 0, xend = 1, y = 0, yend = 1, linetype = "dashed", color = "gray") +
scale_color_brewer(palette = "Set3") +
theme_minimal(base_size = 14) +
labs(
title = "ROC Curves for Each Fold, \n Stepwise backward variable selection - Cross validation",
x = "1 - Specificity",
y = "Sensitivity",
color = "Fold"
) +
theme(
plot.title = element_text(hjust = 0.5, size = 13, face = "bold"),
legend.position = "bottom"
) +
coord_equal()
ggplotly(roc_plot_stepwise_back_cv)auc_table <- data.frame(Iteration = 1:k, AUC = auc_values)
mean_auc <- mean(auc_values)
auc_table$AUC <- round(auc_table$AUC, 4)
mean_auc <- round(mean_auc, 4)
auc_table <- rbind(auc_table, c("AUC mean", mean_auc))
kable(auc_table, caption = "AUC Values for 10-Fold Cross-Validation")| Iteration | AUC |
|---|---|
| 1 | 0.6939 |
| 2 | 0.7062 |
| 3 | 0.7555 |
| 4 | 0.7387 |
| 5 | 0.7378 |
| 6 | 0.7416 |
| 7 | 0.7279 |
| 8 | 0.6791 |
| 9 | 0.7221 |
| 10 | 0.7377 |
| AUC mean | 0.724 |
(auc_confidence_interval = t.test(auc_values)$conf.int)[1] 0.7069292 0.7411659
attr(,"conf.level")
[1] 0.95
Final model
stepwise_backward <- step(
glm(
formula,
data = whole_data,
family = "binomial"
),
direction = "backward"
)
stepwise_backward_pred <- predict(
stepwise_backward,
newdata = whole_data,
type = "response"
)
roc_stepwise_backward <- roc(whole_data$EVENT, stepwise_backward_pred)
auc_stepwise_backward <- auc(roc_stepwise_backward)display code to plot the ROC curve
roc_plot_stepwise_final <- ggplot(data.frame(
Sensitivity = roc_stepwise_backward$sensitivities,
Specificity = 1 - roc_stepwise_backward$specificities
), aes(x = Specificity, y = Sensitivity)) +
geom_line(linewidth = 0.8, color = "#3459E6") +
geom_segment(x = 0, xend = 1, y = 0, yend = 1, linetype = "dashed", color = "gray") +
theme_minimal(base_size = 14) +
labs(
title = "ROC Curve final model \n Stepwise backward variable selection",
x = "1 - Specificity",
y = "Sensitivity"
) +
theme(
plot.title = element_text(hjust = 0.5, size = 15, face = "bold")
) +
coord_equal()
ggplotly(roc_plot_stepwise_final)display code to plot the ROC curve
print(auc_stepwise_backward)Area under the curve: 0.7357
stepwise_backward_variables <- stepwise_backward$anova
stepwise_backward_variablesThe variables included in the final model are SEX, DIABETES and alcohol. SEX has the smallest contribution to the deviance (0.0475), which suggests it’s the least influential in terms of explaining the variability in the outcome. DIABETES has a larger deviance (1.35), making it more important than SEX. Alcohol has the largest deviance (3.91), indicating that it explains the largest portion of the model’s fit.